# importing the required packages
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# NOTE(review): the next line only looks up matplotlib.use without calling it,
# so no backend is selected here. Presumably a call such as
# matplotlib.use('<backend>') was intended — confirm which backend, or remove.
matplotlib.use
import matplotlib.pyplot as plt
# Switch matplotlib to its 'ggplot' style sheet.
plt.style.use('ggplot')
# Load the movie data set from 'tmdb-movies.csv' and take a first look at it.
movie_data = pd.read_csv('tmdb-movies.csv')
movie_data.head(5)
movie_data.shape  # (number of rows, number of columns)
movie_data.info()
movie_data.columns

# Count fully duplicated rows, remove them, then count again to confirm.
movie_data.duplicated().sum()
movie_data = movie_data.drop_duplicates()
movie_data.duplicated().sum()

# Missing values per column: absolute counts first, then as a share of rows.
movie_data.isna().sum()
movie_data.isna().mean()
# The data set contains extraneous columns (homepage, cast, tagline, overview)
# that this analysis never uses. Drop them BEFORE removing incomplete rows, so
# that a movie is not discarded merely because one of these unused columns is
# empty.
movie_data.drop(['homepage', 'cast', 'tagline', 'overview'], axis=1, inplace=True)
# Now remove every row that still has a missing value in a remaining column.
movie_data.dropna(inplace=True)
movie_data.shape
print(movie_data.columns)
print(movie_data.shape)
# Convert the release_date column to datetime values.
# NOTE(review): if the raw dates carry two-digit years they are ambiguous and
# may be parsed into the wrong century — verify against release_year.
movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
movie_data['release_date']
# Pairwise correlation is only defined for numeric columns. The frame also
# holds non-numeric ones (release_date is a datetime now; director is used as
# a label further down), and on pandas >= 2.0 DataFrame.corr() raises on such
# columns instead of silently skipping them, so select the numeric columns
# explicitly.
movie_data.select_dtypes(include='number').corr()
movie_data.describe()
movie_data.head(5)
# Number of movies released per year: count the ids within each release year.
data = movie_data.groupby('release_year')['id'].count()
data
# Line chart of the yearly totals, with an x tick every five years.
plt.xticks(np.arange(1960, 2020, 5))
plt.title("Number of Movies vs Released Year")
plt.ylabel("Number of Movies.")
plt.xlabel("Year.")
plt.plot(data)
plt.show()
# Keep exactly the two vote columns. They are selected by name rather than by
# a label slice: a slice would also pull in any column sitting between the
# two (or come back empty if their order were reversed), so its result
# depends on the column order of the CSV.
df_vote = movie_data[['vote_count', 'vote_average']]
# For comparison, also look only at movies with more than 2000 votes.
df_vote_2000 = df_vote[df_vote['vote_count'] > 2000]
df_vote.tail()
sns.set_style('whitegrid')
# Grid of pairwise plots for vote_count and vote_average.
sns.pairplot(df_vote)
# Correlation over all movies, then over the heavily voted subset.
df_vote.corr()
df_vote_2000.corr()
# Profit of each movie: its revenue minus its budget, stored as a new column.
movie_data['profit'] = movie_data['revenue'].sub(movie_data['budget'])
movie_data[['profit']].head()
def director_with_profit():
    """Report and plot the directors behind the most profitable movies.

    Reads the module-level ``movie_data`` frame (columns ``director`` and
    ``profit``), prints the director of the single most profitable movie
    together with that profit, and draws a bar chart of profit by director
    for the 20 most profitable movies.

    Returns:
        None. The function only prints and draws.
    """
    ranked = movie_data[['director', 'profit']].sort_values(by='profit', ascending=False)
    if ranked.empty:
        # Nothing to rank or plot.
        print('Director with highest profit: no data available')
        return

    # Take director and profit from the SAME (top) row. A column-wise max()
    # over the first rows would pair the highest profit with the
    # alphabetically last director name instead of the matching one.
    best = ranked.iloc[0]
    print(f"Director with highest profit {best['director']} ({best['profit']})")

    # Open a fresh figure so the bars are not drawn onto whatever axes an
    # earlier plotting call left active.
    plt.figure()
    # A director with several movies among the top 20 gets a single bar;
    # seaborn's barplot aggregates those rows with its default estimator (mean).
    sns.barplot(x='director', y='profit', palette="rocket", data=ranked.head(20))
    plt.xticks(rotation=90)
    plt.title('Highest profit director')


director_with_profit()
sns.set_style('darkgrid')
# Scatter plot with a fitted regression line: how revenue relates to budget.
sns.lmplot(x="budget", y="revenue", data=movie_data)
plt.title('Budget vs Revenue')

# Columns used for the remaining plots.
data = movie_data[['popularity', 'revenue', 'budget', 'vote_count', 'vote_average', 'release_year']]
data.head()

# Budget and revenue per release year, on a figure of their own; without a
# new figure both lines would be drawn onto the regression plot above.
# `palette` is not passed: there is no `hue` variable, so seaborn ignores it.
plt.figure()
sns.lineplot(x=data['release_year'], y=data['budget'], linewidth=2.5, label='budget')
sns.lineplot(x=data['release_year'], y=data['revenue'], linewidth=2.5, label='revenue')
# Both series share one axis, so name it for both and show which line is which.
plt.ylabel('budget / revenue')
plt.legend()

# Pairwise relationships between the numeric columns, coloured by release year.
t = sns.pairplot(data[['revenue', 'release_year', 'vote_count', 'vote_average', 'budget']], hue='release_year')
t.fig.suptitle('Analysis on revenue, vote_count, vote_average', y=1.05)
# Display the figures created since the previous plt.show().
plt.show()
'''There appears to be a positive correlation between budget and revenue, and a very slight positive correlation between release year and budget. The average rating is also slightly positively associated with budget. These effects are only slight, though, so the analysis here is limited. None of this indicates causation for improvements in revenue or rating; a much deeper analysis would be required to establish any causal relationship.'''